
################################################################################
################################################################################
###                                                                          ###
### --- Filename: 1.1. Data Preparation.R ---------------------------------- ###
###                                                                          ###
### --- Description: This is the R script for the data preparation for ----- ###
### --- our paper "Economic Inequality and Public Support for -------------- ###
### --- Redistribution in Europe: A Cross-Sectional and Longitudinal ------- ###
### --- Multilevel Analysis" submitted to the IJPOR ------------------------ ###
###                                                                          ###
### --- Author: Valentin Velev --------------------------------------------- ###
###                                                                          ###
### --- Date: 11.12.2023 --------------------------------------------------- ###
###                                                                          ###
################################################################################
################################################################################

################################################################################
### --- 1. Preparation ----------------------------------------------------- ###
################################################################################

# install and load packages
required_packages <- c(
  # packages for loading and wrangling data
  "tidyverse", "haven", "readxl", "sjmisc", "sjlabelled", "fastDummies",
  # packages for (multiple) imputation
  "naniar", "imputeTS", "jomo", "mitml",
  # packages for statistical models
  "lme4", "lmerTest", "broom.mixed",
  # packages for post-estimation
  "influence.ME", "easystats", "sjPlot", "jtools", "interplot", "interactions",
  # other packages
  "ggpubr", "qqplotr", "NCmisc", "rstatix", "statar"
)

for (p in required_packages){
  
  if(!require(p, character.only=T)) install.packages(p, dependencies=T)
  library(p, character.only=T)
  
}

rm(p, required_packages)

# set seed to ensure replicability
set.seed(2946)

# global options
options(digits=6, scipen=999, max.print=10000)

################################################################################
### --- 2. Load data ------------------------------------------------------- ###
################################################################################

# load and clean individual-level data from ESS files
ess_dat_raw <- 
  #load data from all ESS files
  bind_rows(
    read_dta("Data/ESS_main.dta") %>%
      remove_all_labels() %>%
      subset(select=-c(name, edition, proddate, idno, iscoco, isco08, dweight, 
                       pspwght, eduyrs, pweight, rlgatnd, rlgdgr, hhmmb, domicil, 
                       imbgeco, imueclt, imwbcnt)),
    read_dta("Data/ESS_2_italy.dta") %>%
      remove_all_labels() %>%
      subset(select=c(essround, cntry, gincdif, lrscale, health, gndr, agea, 
                      edulvla, hinctnt, marital, mbtru, mnactic)),
    read_dta("Data/ESS_3_latvia.dta") %>%
      remove_all_labels() %>%
      subset(select=c(essround, cntry, gincdif, lrscale, health, gndr, agea, 
                      edulvla, hinctnt, maritala, mbtru, mnactic)),
    read_dta("Data/ESS_3_romania.dta") %>%
      remove_all_labels() %>%
      subset(select=c(essround, cntry, gincdif, lrscale, health, gndr, agea, 
                      edulvla, hinctnt, maritala, mbtru, mnactic)),
    read_dta("Data/ESS_4_austria.dta") %>%
      remove_all_labels() %>%
      subset(select=c(essround, cntry, gincdif, lrscale, health, gndr, agea, 
                      edulvla, hinctnta, maritala, mbtru, mnactic)),
    read_dta("Data/ESS_4_lithuania.dta") %>%
      remove_all_labels() %>%
      subset(select=c(essround, cntry, gincdif, lrscale, health, gndr, agea, 
                      edulvla, hinctnta, maritala, mbtru, mnactic)),
    read_dta("Data/ESS_5_austria.dta") %>%
      remove_all_labels() %>%
      subset(select=c(essround, cntry, gincdif, lrscale, health, gndr, agea, 
                      edulvlb, hinctnta, maritalb, mbtru, mnactic)),
    read_dta("Data/ESS_9_romania.dta") %>%
      remove_all_labels() %>%
      subset(select=c(essround, cntry, gincdif, lrscale, health, gndr, agea, 
                      edulvlb, hinctnta, maritalb, mbtru, mnactic))
  ) %>%
  # convert missing values to NA
  sjlabelled::set_na(na=list(
    gincdif = c(7, 8, 9),
    lrscale = c(77, 88, 99),
    health = c(7, 8, 9),
    gndr = 9,
    edulvla = c(55, 77, 88, 99),
    edulvlb = c(5555, 7777, 8888, 9999),
    eduyrs = c(77, 88, 99),
    hinctnt = c(77, 88, 99),
    hinctnta = c(77, 88, 99),
    marital = c(7, 8, 9),
    maritala = c(77, 88, 99),
    maritalb = c(77, 88, 99),
    mbtru = c(7, 8, 9),
    mnactic = c(66, 77, 88, 99),
    agea = 999)
  )

################################################################################

# check structure of the data
str(ess_dat_raw)

# check distribution of all variables with for loop
vars <- names(ess_dat_raw)
list = list()
counter = ""

for (i in vars) {
  counter=i
  list[[counter]] <- as.data.frame(frq(ess_dat_raw[,i]))
  print(i)
  print(list[[counter]])
}

rm(vars, list, counter, i)

################################################################################

# load macro-level data
macrodata <- left_join(
  # load and re-code main macro-level data
  readxl::read_excel("Data/macrodata.xlsx") %>%
    mutate(gdp_pc = (gdp_pc / 1000)),
  # load and impute data on immigration
  readxl::read_excel("Data/immig_dat.xlsx") %>%
    imputeTS::na_interpolation(option="linear"),
  # how to join
  by=c("country", "year")
)

################################################################################
### --- 3. Wrangle and merge data ------------------------------------------ ###
################################################################################

# re-code variables not included in MI
ess_dat_adj <- ess_dat_raw %>%
  # rename variables
  dplyr::rename(age = agea,
                income_2002_2006 = hinctnt,
                income_2008_2018 = hinctnta
                ) %>%
  # re-coding country and year
  mutate(country = rec(cntry, rec="GB=UK; else=copy"),
         year = rec(essround, rec="1=2002; 2=2004; 3=2006; 4=2008; 5=2010;
                    6=2012; 7=2014; 8=2016; 9=2018"), 
         .before="lrscale"
         ) %>%
  # re-coding the DV
  mutate(redist = (gincdif * -1) + 5) %>%
  # re-coding the IVs
  mutate(male =  rec(gndr, rec="1=1; 2=0"),
         badhealth = rec(health, rec="3:5=1; else=0"),
         unionmember = rec(mbtru, rec="1=1; else=0"),
         employment = rec(mnactic, rec="1,7=1; 3,4=2; else=0")
         ) %>%
  # case-when re-coding of IVs
  mutate(married = case_when((marital == 1 & !is.na(marital)) |
                               (maritala == 1 | maritala == 2 & !is.na(maritala)) |
                               (maritalb == 1 | maritalb == 2 & !is.na(maritalb)) ~ 1,
                             TRUE & !is.na(marital) | !is.na(maritala) | !is.na(maritalb) ~ 0),
         edulvl = case_when((edulvla == 1 & !is.na(edulvla)) |
                              (edulvlb <= 129 & !is.na(edulvlb)) ~ 1,
                            (edulvla == 2 & !is.na(edulvla)) |
                              (edulvlb >= 212 & edulvlb <= 229 & !is.na(edulvlb)) ~ 2,
                            (edulvla == 3 & !is.na(edulvla)) |
                              (edulvlb >= 311 & edulvlb <= 323 & !is.na(edulvlb)) ~ 3,
                            (edulvla == 4 & !is.na(edulvla)) |
                              (edulvlb >= 412 & edulvlb <= 423 & !is.na(edulvlb)) ~ 4,
                            (edulvla == 5 & !is.na(edulvla)) |
                              (edulvlb >= 510 & !is.na(edulvlb)) ~ 5)
         ) %>%
  # create country-year variable
  mutate(country_year = paste0(country, year), .after="year") %>%
  # remove original variables
  subset(select=-c(cntry, essround, gndr, gincdif, marital, maritala, maritalb, 
                   edulvla, edulvlb, mbtru, mnactic, health))

################################################################################

# re-code income variable (into country-year-specific quantiles)
ess_dat_adj_income <- ess_dat_adj %>%
  group_by(country, year) %>%
  mutate(income_1 = rec(income_2008_2018, rec="1,2=1; 3,4=2; 5,6=3; 7,8=4; 9,10=5"),
         income_2 = statar::xtile(income_2002_2006, n=5),
         income = coalesce(income_1, income_2),
         .after="income_2008_2018"
         ) %>%
  subset(select=-c(income_2002_2006, income_2008_2018, income_1, income_2)) %>%
  ungroup()

################################################################################

# merge micro- and macro-level data and turn categorical variables into factors
data_with_NA <- left_join(ess_dat_adj_income, macrodata, by=c("country", "year"))

# check merged dataframe
str(data_with_NA)

################################################################################
### --- 4. Missing data ---------------------------------------------------- ###
################################################################################

# check NA distribution by variable
as.data.frame(naniar::miss_var_summary(data_with_NA))

# LWD
## create df without missings using LWD
data_lwd <- data_with_NA[complete.cases(data_with_NA),]

## check NA distribution
as.data.frame(naniar::miss_var_summary(data_lwd))

## check how many observations were deleted
(nrow(data_with_NA)-nrow(data_lwd)) # 140,561 obs
((nrow(data_with_NA)-nrow(data_lwd))/nrow(data_with_NA)*100) # or 35,65%

################################################################################
### --- 5. Create new variables for regression models ---------------------- ###
################################################################################

# country-year-mean redistribution variable
data <- data_lwd %>%
  group_by(country, year) %>%
  mutate(redist_C = mean(redist)) %>%
  ungroup()

# country-mean and -demeaned variables + grand mean centered individual-level variables
data <- data %>%
  de_mean(redist, income, lrscale, gini_disp, gdp_pc, unemp, left_gov, global_index, 
          immig, soc_exp, gini_mrkt, grp=country, suffix.dm="_we", suffix.gm="_be") %>%
  mutate(across(c(income, lrscale, age), function(x){x - mean(x)}, .names="{col}_cgm"))

# create group mean centered ("double centered") income variable
data <- data %>%
  group_by(country) %>%
  mutate(income_cwc = income_cgm - mean(income_cgm)) %>%
  ungroup()

# create df with factorized categorical variables for regression models
reg_dat <- data %>%
  mutate(across(c(year, male, employment, edulvl, married, badhealth, unionmember, elec_sys), as.factor))

################################################################################
### --- 6. Sample size ----------------------------------------------------- ###
################################################################################

# average, minimum, and maximum country-year sample size
avg <- data %>%
  group_by(country, year) %>%
  summarize(amnt = n())

min(avg$amnt)
max(avg$amnt)
mean(avg$amnt)

# individual-, country-year, and country-level sample size
nrow(data)
n_distinct(data$country_year)
n_distinct(data$country)

################################################################################
### --- 7. Save ------------------------------------------------------------ ###
################################################################################

save(data_with_NA, file="Environment/data_with_NA.RData")
save(data, file="Environment/data.RData")
save(reg_dat, file="Environment/reg_dat.RData")





